from docx import Document
from lxml import etree

# XML namespace prefix map for WordprocessingML ("w:") element lookups.
_W_NAMESPACE_URI = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
NS = {"w": _W_NAMESPACE_URI}


def extract_word_tables(doc_path):
    """Extract the cell text of every table in a Word document.

    A horizontally merged cell (``w:gridSpan``) is emitted once per row
    rather than once for each grid column it covers. Newline characters
    inside a cell are removed and the remaining text is stripped of leading
    and trailing whitespace.

    NOTE: vertical merges (``w:vMerge``) are not collapsed by this function;
    each row reports whatever ``row.cells`` yields for that position.

    Args:
        doc_path: Value passed straight to ``docx.Document`` (typically the
            path of a ``.docx`` file).

    Returns:
        A list with one entry per table in ``doc.tables``. Each table is a
        list of rows, and each row is a list of cleaned cell strings.

    Raises:
        ValueError: If a ``w:gridSpan`` value is not an integer.
    """
    doc = Document(doc_path)
    # Fully qualified name of the ``w:val`` attribute in Clark notation.
    val_attr = f'{{{NS["w"]}}}val'
    tables_data = []

    for table in doc.tables:
        table_data = []
        for row in table.rows:
            # ``row.cells`` is a computed property; read it once per row
            # instead of recomputing it for every cell that is visited.
            cells = row.cells
            max_col = len(cells)
            row_data = []
            col_idx = 0

            while col_idx < max_col:
                cell = cells[col_idx]

                # Look only at this cell's own properties. A descendant
                # search would also match gridSpan elements that belong to
                # cells of a table nested inside this cell.
                grid_span = 1
                span_elem = cell._tc.find("w:tcPr/w:gridSpan", namespaces=NS)
                if span_elem is not None:
                    grid_span = int(span_elem.get(val_attr, 1))

                # Drop newlines from the cell text, then trim whitespace.
                cell_text = cell.text or ""
                row_data.append(cell_text.replace('\n', '').strip())

                # Always advance by at least one column so a malformed
                # span of zero or less cannot stall the loop.
                col_idx += max(grid_span, 1)

            table_data.append(row_data)
        tables_data.append(table_data)

    return tables_data
